/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"

/* Return the relative patch ID for the stage being compiled.
 *
 * Only valid for TCS and TES; any other stage asserts and yields NULL.
 */
static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
{
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* Unpacked from the TCS relative-IDs argument (shift 0, width 8). */
		return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
	}

	if (ctx->type == PIPE_SHADER_TESS_EVAL) {
		/* TES receives it as a dedicated argument. */
		return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
	}

	assert(0);
	return NULL;
}

/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */

/* TCS input patch stride, unpacked from vs_state_bits (shift 11, width 13). */
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	LLVMValueRef patch_stride =
		si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);

	return patch_stride;
}

/* Compile-time stride, in dwords, of one TCS output vertex.
 *
 * The stride is 4 dwords per slot up to the highest slot set in the relevant
 * mask: the inputs copied by the fixed-function TCS when that mask is
 * non-zero, otherwise the outputs written by the selector.
 */
static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
{
	uint64_t slot_mask;

	assert(ctx->type == PIPE_SHADER_TESS_CTRL);

	slot_mask = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
	if (!slot_mask)
		slot_mask = ctx->shader->selector->outputs_written;

	return util_last_bit64(slot_mask) * 4;
}

/* Same as get_tcs_out_vertex_dw_stride_constant(), wrapped as an i32 constant. */
static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
{
	return LLVMConstInt(ctx->ac.i32,
			    get_tcs_out_vertex_dw_stride_constant(ctx), 0);
}

/* Stride, in dwords, between the outputs of consecutive TCS patches.
 *
 * The fixed-function TCS unpacks it from tcs_out_lds_layout; otherwise it is
 * computed at compile time from the selector's output information.
 */
static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
		return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);

	unsigned out_vertices =
		ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
	unsigned dwords_per_vertex = get_tcs_out_vertex_dw_stride_constant(ctx);
	unsigned patch_slots =
		util_last_bit64(ctx->shader->selector->patch_outputs_written);

	/* Per-vertex data for every output vertex, then 4 dwords per patch slot. */
	unsigned dwords_per_patch = out_vertices * dwords_per_vertex + patch_slots * 4;

	return LLVMConstInt(ctx->ac.i32, dwords_per_patch, 0);
}

/* Offset of the TCS outputs of patch 0: the low 16 bits of
 * tcs_out_lds_offsets, multiplied by 4.
 */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	LLVMValueRef packed =
		si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16);
	LLVMValueRef four = LLVMConstInt(ctx->ac.i32, 4, 0);

	return LLVMBuildMul(ctx->ac.builder, packed, four, "");
}

/* Offset of the per-patch TCS outputs of patch 0: the high 16 bits of
 * tcs_out_lds_offsets, multiplied by 4.
 */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	LLVMValueRef packed =
		si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16);
	LLVMValueRef four = LLVMConstInt(ctx->ac.i32, 4, 0);

	return LLVMBuildMul(ctx->ac.builder, packed, four, "");
}

/* Offset of the current patch's TCS inputs: input patch stride * RelPatchID. */
static LLVMValueRef
get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
	LLVMValueRef stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef patch = get_rel_patch_id(ctx);

	return LLVMBuildMul(ctx->ac.builder, stride, patch, "");
}

/* Offset of the current patch's TCS outputs: the patch 0 offset advanced by
 * RelPatchID output patch strides.
 */
static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	LLVMValueRef first_patch = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef patch = get_rel_patch_id(ctx);

	return ac_build_imad(&ctx->ac, stride, patch, first_patch);
}

/* Offset of the current patch's per-patch TCS outputs: the patch 0 per-patch
 * data offset advanced by RelPatchID output patch strides.
 */
static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	LLVMValueRef first_patch_data = get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef patch = get_rel_patch_id(ctx);

	return ac_build_imad(&ctx->ac, stride, patch, first_patch_data);
}

/* Number of TCS output vertices per patch.
 *
 * A constant is returned when compiling a TCS whose selector declares a
 * non-zero vertex count; otherwise the value is unpacked from
 * tcs_offchip_layout.
 */
static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
{
	unsigned declared_vertices = 0;

	if (ctx->shader->selector) {
		declared_vertices =
			ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
	}

	/* If the count is zero, it's either the fixed-func TCS or the TCS epilog. */
	if (ctx->type != PIPE_SHADER_TESS_CTRL || !declared_vertices)
		return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);

	return LLVMConstInt(ctx->ac.i32, declared_vertices, 0);
}

/* Stride, in dwords, of one TCS input vertex.
 *
 * In a VS the selector's lshs_vertex_stride is used. In a TCS the LS
 * selector's stride is used when the shader is monolithic on GFX9+;
 * otherwise the value is unpacked from vs_state_bits. Other stages assert.
 */
static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
{
	if (ctx->type == PIPE_SHADER_VERTEX) {
		unsigned dw_stride = ctx->shader->selector->lshs_vertex_stride / 4;

		return LLVMConstInt(ctx->ac.i32, dw_stride, 0);
	}

	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (ctx->screen->info.chip_class < GFX9 ||
		    !ctx->shader->is_monolithic)
			return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);

		unsigned dw_stride =
			ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;

		return LLVMConstInt(ctx->ac.i32, dw_stride, 0);
	}

	assert(0);
	return NULL;
}

/* Compute the dword address of an input/output element.
 *
 * \param vertex_dw_stride	per-vertex stride in dwords, or NULL to skip
 *				the vertex term
 * \param base_addr		starting dword address
 * \param vertex_index		vertex index (used only with a stride)
 * \param param_index		optional dynamic slot index (4 dwords per slot)
 * \param name, index		semantic selecting the slot
 */
static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
							LLVMValueRef vertex_dw_stride,
							LLVMValueRef base_addr,
							LLVMValueRef vertex_index,
							LLVMValueRef param_index,
							ubyte name, ubyte index)
{
	LLVMValueRef addr = base_addr;
	int slot;

	if (vertex_dw_stride)
		addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, addr);

	if (param_index) {
		LLVMValueRef dwords_per_slot = LLVMConstInt(ctx->ac.i32, 4, 0);

		addr = ac_build_imad(&ctx->ac, param_index, dwords_per_slot, addr);
	}

	/* Patch and tess-level semantics use the per-patch slot numbering. */
	if (name == TGSI_SEMANTIC_PATCH ||
	    name == TGSI_SEMANTIC_TESSINNER ||
	    name == TGSI_SEMANTIC_TESSOUTER)
		slot = si_shader_io_get_unique_index_patch(name, index);
	else
		slot = si_shader_io_get_unique_index(name, index, false);

	/* Add the base address of the element. */
	return LLVMBuildAdd(ctx->ac.builder, addr,
			    LLVMConstInt(ctx->ac.i32, slot * 4, 0), "");
}

/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
/* Compute the address of an attribute in the TCS->TES offchip buffer.
 *
 * \param rel_patch_id	relative patch ID
 * \param vertex_index	vertex within the patch, or NULL for per-patch data
 * \param param_index	attribute slot index
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef verts_per_patch = get_num_tcs_out_vertices(ctx);
	LLVMValueRef patch_count =
		si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
	LLVMValueRef verts_total =
		LLVMBuildMul(builder, verts_per_patch, patch_count, "");
	LLVMValueRef element, attrib_stride, addr;

	if (vertex_index) {
		/* Per-vertex: one attribute spans all vertices of all patches. */
		element = ac_build_imad(&ctx->ac, rel_patch_id,
					verts_per_patch, vertex_index);
		attrib_stride = verts_total;
	} else {
		/* Per-patch: one attribute spans all patches. */
		element = rel_patch_id;
		attrib_stride = patch_count;
	}

	addr = ac_build_imad(&ctx->ac, param_index, attrib_stride, element);
	/* Every attribute has 4 components, i.e. 16 bytes per element. */
	addr = LLVMBuildMul(builder, addr, LLVMConstInt(ctx->ac.i32, 16, 0), "");

	if (vertex_index)
		return addr;

	/* Per-patch attributes start at an offset packed in the layout word. */
	LLVMValueRef patch_data_offset =
		si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);

	return LLVMBuildAdd(builder, addr, patch_data_offset, "");
}

/* Resolve a semantic (name, index) plus an optional dynamic slot offset to an
 * offchip buffer address for the current relative patch.
 *
 * \param vertex_index	vertex within the patch, or NULL for per-patch data
 * \param param_index	dynamic slot offset, or NULL for none
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
					struct si_shader_context *ctx,
					LLVMValueRef vertex_index,
					LLVMValueRef param_index,
					ubyte name, ubyte index)
{
	unsigned slot;

	/* Patch and tess-level semantics use the per-patch slot numbering. */
	if (name == TGSI_SEMANTIC_PATCH ||
	    name == TGSI_SEMANTIC_TESSINNER ||
	    name == TGSI_SEMANTIC_TESSOUTER)
		slot = si_shader_io_get_unique_index_patch(name, index);
	else
		slot = si_shader_io_get_unique_index(name, index, false);

	LLVMValueRef slot_const = LLVMConstInt(ctx->ac.i32, slot, 0);

	if (param_index)
		param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
					   slot_const, "");
	else
		param_index = slot_const;

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}

/* Load from a buffer with the glc cache policy.
 *
 * \param type		element type of the result
 * \param swizzle	component to return; ~0 returns the whole vec4
 * \param buffer	buffer descriptor
 * \param offset, base	forwarded to ac_build_buffer_load() (note that they
 *			are passed there in the order base, offset)
 *
 * Non-64-bit types load a vec4 and extract the requested component; 64-bit
 * types are assembled from two single-dword loads.
 */
static LLVMValueRef buffer_load(struct si_shader_context *ctx,
                                LLVMTypeRef type, unsigned swizzle,
                                LLVMValueRef buffer, LLVMValueRef offset,
                                LLVMValueRef base, bool can_speculate)
{
	LLVMTypeRef vec4_type = LLVMVectorType(type, 4);
	const bool whole_vec4 = swizzle == ~0;

	if (whole_vec4 || ac_get_type_size(type) != 8) {
		LLVMValueRef vec =
			ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, ac_glc, can_speculate, false);

		vec = LLVMBuildBitCast(ctx->ac.builder, vec, vec4_type, "");
		if (whole_vec4)
			return vec;

		return LLVMBuildExtractElement(ctx->ac.builder, vec,
					       LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
	}

	/* 64-bit element: fetch the low and high dwords separately. */
	LLVMValueRef lo =
		ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, ac_glc, can_speculate, false);
	LLVMValueRef hi =
		ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4 + 4, ac_glc, can_speculate, false);

	return si_build_gather_64bit(ctx, type, lo, hi);
}

/**
 * Load from LSHS LDS storage.
 *
 * A swizzle of ~0 loads components 0..3 and gathers them into a vec4.
 * 64-bit types are built from two consecutive 32-bit loads.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx,
				  LLVMTypeRef type, unsigned swizzle,
				  LLVMValueRef dw_addr)
{
	if (swizzle == ~0) {
		LLVMValueRef comps[4];
		unsigned c = 0;

		while (c < 4) {
			comps[c] = lshs_lds_load(ctx, type, c, dw_addr);
			c++;
		}

		return ac_build_gather_values(&ctx->ac, comps, 4);
	}

	/* Split 64-bit loads. */
	if (ac_get_type_size(type) == 8) {
		LLVMValueRef lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr);
		LLVMValueRef hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr);

		return si_build_gather_64bit(ctx, type, lo, hi);
	}

	LLVMValueRef elem_addr =
		LLVMBuildAdd(ctx->ac.builder, dw_addr,
			     LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
	LLVMValueRef raw = ac_lds_load(&ctx->ac, elem_addr);

	return LLVMBuildBitCast(ctx->ac.builder, raw, type, "");
}

/**
 * Store to LSHS LDS storage.
 *
 * \param dw_offset_imm	constant offset in dwords, added to dw_addr
 * \param dw_addr	address in dwords
 * \param value		value to store
 */
static void lshs_lds_store(struct si_shader_context *ctx,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	LLVMValueRef imm = LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0);
	LLVMValueRef dst = LLVMBuildAdd(ctx->ac.builder, dw_addr, imm, "");

	ac_lds_store(&ctx->ac, dst, value);
}

/* Selects which ring buffer get_tess_ring_descriptor() builds a descriptor for. */
enum si_tess_ring {
	/* Tess factor ring: TCS-side address plus tess_offchip_ring_size. */
	TCS_FACTOR_RING,
	/* Offchip ring addressed from the TCS (address bits from tcs_out_lds_layout). */
	TESS_OFFCHIP_RING_TCS,
	/* Offchip ring addressed from the TES (address from tes_offchip_addr). */
	TESS_OFFCHIP_RING_TES,
};

static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
					     enum si_tess_ring ring)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef addr = ac_get_arg(&ctx->ac,
				       ring == TESS_OFFCHIP_RING_TES ?
				       ctx->tes_offchip_addr :
				       ctx->tcs_out_lds_layout);

	/* TCS only receives high 13 bits of the address. */
	if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
		addr = LLVMBuildAnd(builder, addr,
				    LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
	}

	if (ring == TCS_FACTOR_RING) {
		unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
		addr = LLVMBuildAdd(builder, addr,
				    LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
	}

	uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

	if (ctx->screen->info.chip_class >= GFX10)
		rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
			 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
			 S_008F0C_RESOURCE_LEVEL(1);
	else
		rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

	LLVMValueRef desc[4];
	desc[0] = addr;
	desc[1] = LLVMConstInt(ctx->ac.i32,
			       S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
	desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
	desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);

	return ac_build_gather_values(&ctx->ac, desc, 4);
}

void si_llvm_preload_tes_rings(struct si_shader_context *ctx)
{
	ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
}

/* ABI callback: load TCS inputs, or read back TCS outputs, from LDS.
 *
 * \param vertex_index		vertex to read (per-vertex data)
 * \param param_index		dynamic slot index, or NULL to use const_index
 * \param driver_location	driver location; divided by 4 to get the slot
 * \param component		first component to load
 * \param num_components	number of components to load
 * \param is_patch		whether the varying is per-patch
 * \param load_input		true to read inputs, false to read outputs
 */
static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
					     LLVMTypeRef type,
					     LLVMValueRef vertex_index,
					     LLVMValueRef param_index,
					     unsigned const_index,
					     unsigned location,
					     unsigned driver_location,
					     unsigned component,
					     unsigned num_components,
					     bool is_patch,
					     bool is_compact,
					     bool load_input)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader_info *info = &ctx->shader->selector->info;
	const unsigned slot = driver_location / 4;
	const ubyte name = load_input ? info->input_semantic_name[slot] :
					info->output_semantic_name[slot];
	const ubyte index = load_input ? info->input_semantic_index[slot] :
					 info->output_semantic_index[slot];
	LLVMValueRef stride, dw_addr;

	assert((name == TGSI_SEMANTIC_PATCH ||
		name == TGSI_SEMANTIC_TESSINNER ||
		name == TGSI_SEMANTIC_TESSOUTER) == is_patch);

	if (load_input) {
		stride = get_tcs_in_vertex_dw_stride(ctx);
		dw_addr = get_tcs_in_current_patch_offset(ctx);
	} else if (is_patch) {
		/* Per-patch outputs have no vertex term. */
		stride = NULL;
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
	} else {
		stride = get_tcs_out_vertex_dw_stride(ctx);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
	}

	if (!param_index)
		param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);

	dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
						      vertex_index, param_index,
						      name, index);

	LLVMValueRef value[4];
	for (unsigned i = 0; i < num_components; i++) {
		/* 64-bit components occupy two dwords each. */
		unsigned dw = ac_get_type_size(type) == 8 ? i * 2 : i;

		value[component + i] = lshs_lds_load(ctx, type, component + dw,
						     dw_addr);
	}

	return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* ABI callback: load TES inputs from the offchip ring.
 *
 * \param vertex_index		vertex to read, or NULL for per-patch data
 * \param param_index		dynamic slot index, or NULL to use const_index
 * \param driver_location	driver location; divided by 4 to get the slot
 * \param component		first component to load
 * \param num_components	number of components to load
 * \param is_patch		whether the input is per-patch
 */
static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
					  LLVMTypeRef type,
					  LLVMValueRef vertex_index,
					  LLVMValueRef param_index,
					  unsigned const_index,
					  unsigned location,
					  unsigned driver_location,
					  unsigned component,
					  unsigned num_components,
					  bool is_patch,
					  bool is_compact,
					  bool load_input)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader_info *info = &ctx->shader->selector->info;
	const unsigned slot = driver_location / 4;
	const ubyte name = info->input_semantic_name[slot];
	const ubyte index = info->input_semantic_index[slot];

	assert((name == TGSI_SEMANTIC_PATCH ||
		name == TGSI_SEMANTIC_TESSINNER ||
		name == TGSI_SEMANTIC_TESSOUTER) == is_patch);

	LLVMValueRef base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);

	if (!param_index)
		param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);

	LLVMValueRef addr =
		get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
								param_index,
								name, index);

	/* TODO: This will generate rather ordinary llvm code, although it
	 * should be easy for the optimiser to fix up. In future we might want
	 * to refactor buffer_load().
	 */
	LLVMValueRef value[4];
	for (unsigned comp = 0; comp < num_components; comp++) {
		unsigned dw = comp;

		if (ac_get_type_size(type) == 8) {
			dw = comp * 2;

			/* The third 64-bit component starts in the next slot. */
			if (dw == 4) {
				addr = get_tcs_tes_buffer_address_from_generic_indices(
					ctx, vertex_index, param_index,
					info->input_semantic_name[slot + 1],
					info->input_semantic_index[slot + 1]);
			}

			dw %= 4;
		}

		value[component + comp] = buffer_load(ctx, type, component + dw,
						      ctx->tess_offchip_ring,
						      base, addr, true);
	}

	return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* ABI callback: store a TCS output.
 *
 * Each written channel is stored to LDS (unless nothing reads it back) and to
 * the offchip ring. Tess factors are not written to the offchip ring here;
 * when they are defined in all invocations they are also saved into
 * invoc0_tess_factors for the epilog.
 *
 * \param var		NIR variable being written
 * \param vertex_index	output vertex (per-vertex outputs)
 * \param param_index	dynamic slot index, or NULL to use const_index
 * \param src		value to store
 * \param writemask	channels to write (bits 0..7)
 */
static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
				    const struct nir_variable *var,
				    LLVMValueRef vertex_index,
				    LLVMValueRef param_index,
				    unsigned const_index,
				    LLVMValueRef src,
				    unsigned writemask)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader_info *info = &ctx->shader->selector->info;
	const unsigned first_chan = var->data.location_frac;
	const unsigned slot = var->data.driver_location / 4;
	const ubyte name = info->output_semantic_name[slot];
	const ubyte index = info->output_semantic_index[slot];
	const bool name_is_patch = name == TGSI_SEMANTIC_PATCH ||
				   name == TGSI_SEMANTIC_TESSINNER ||
				   name == TGSI_SEMANTIC_TESSOUTER;
	const bool is_patch = var->data.patch ||
			      var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
			      var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
	const bool has_const_index = !param_index;
	bool is_tess_factor = false, is_tess_inner = false;
	bool skip_lds_store;
	LLVMValueRef lds_addr, ring, ring_base, ring_addr;
	LLVMValueRef values[8];

	if (has_const_index)
		param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);

	/* Invalid SPIR-V can cause this. */
	if (name_is_patch != is_patch)
		return;

	if (is_patch) {
		lds_addr = get_tcs_out_current_patch_data_offset(ctx);
		lds_addr = get_dw_address_from_generic_indices(ctx, NULL, lds_addr,
							       vertex_index, param_index,
							       name, index);

		skip_lds_store = !info->reads_perpatch_outputs;

		/* Always write tess factors into LDS for the TCS epilog. */
		if (has_const_index && const_index == 0 &&
		    (name == TGSI_SEMANTIC_TESSINNER ||
		     name == TGSI_SEMANTIC_TESSOUTER)) {
			/* The epilog doesn't read LDS if invocation 0 defines tess factors. */
			skip_lds_store = !info->reads_tessfactor_outputs &&
					 info->tessfactors_are_def_in_all_invocs;
			is_tess_factor = true;
			is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
		}
	} else {
		LLVMValueRef stride = get_tcs_out_vertex_dw_stride(ctx);

		lds_addr = get_tcs_out_current_patch_offset(ctx);
		lds_addr = get_dw_address_from_generic_indices(ctx, stride, lds_addr,
							       vertex_index, param_index,
							       name, index);

		skip_lds_store = !info->reads_pervertex_outputs;
	}

	ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
	ring_base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
	ring_addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
								    param_index,
								    name, index);

	for (unsigned chan = first_chan; chan < 8; chan++) {
		if ((writemask & (1 << chan)) == 0)
			continue;

		LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src,
							  chan - first_chan);

		/* Channels 4..7 belong to the next slot in the offchip ring. */
		if (chan == 4) {
			ring_addr = get_tcs_tes_buffer_address_from_generic_indices(
				ctx, vertex_index, param_index,
				info->output_semantic_name[slot + 1],
				info->output_semantic_index[slot + 1]);
		}

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lshs_lds_store(ctx, chan, lds_addr, value);

		value = ac_to_integer(&ctx->ac, value);
		values[chan] = value;

		/* A full 0xF mask is written as one vec4 after the loop. */
		if (!is_tess_factor && writemask != 0xF) {
			ac_build_buffer_store_dword(&ctx->ac, ring, value, 1,
						    ring_addr, ring_base,
						    4 * (chan % 4), ac_glc);
		}

		/* Write tess factors into VGPRs for the epilog. */
		if (is_tess_factor && info->tessfactors_are_def_in_all_invocs) {
			if (is_tess_inner) {
				if (chan < 2) {
					LLVMBuildStore(ctx->ac.builder, value, /* inner */
						       ctx->invoc0_tess_factors[4 + chan]);
				}
			} else {
				LLVMBuildStore(ctx->ac.builder, value, /* outer */
					       ctx->invoc0_tess_factors[chan]);
			}
		}
	}

	if (!is_tess_factor && writemask == 0xF) {
		LLVMValueRef vec4 = ac_build_gather_values(&ctx->ac, values, 4);

		ac_build_buffer_store_dword(&ctx->ac, ring, vec4, 4, ring_addr,
					    ring_base, 0, ac_glc);
	}
}

/* ABI callback: return the tessellation coordinate as a 4-component vector
 * (u, v, w, 0), where w is 1-u-v for triangles and 0 otherwise.
 */
static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	LLVMValueRef u = ac_get_arg(&ctx->ac, ctx->tes_u);
	LLVMValueRef v = ac_get_arg(&ctx->ac, ctx->tes_v);
	LLVMValueRef w = ctx->ac.f32_0;

	/* For triangles, the vector should be (u, v, 1-u-v). */
	if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
	    PIPE_PRIM_TRIANGLES) {
		LLVMValueRef u_plus_v = LLVMBuildFAdd(ctx->ac.builder, u, v, "");

		w = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, u_plus_v, "");
	}

	LLVMValueRef coord[4] = { u, v, w, ctx->ac.f32_0 };

	return ac_build_gather_values(&ctx->ac, coord, 4);
}

/* Load a tess level vec4 (per-patch attribute) for the current patch from the
 * offchip ring.
 *
 * \param semantic_name	semantic passed to
 *			si_shader_io_get_unique_index_patch() to pick the slot
 */
static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
				    unsigned semantic_name)
{
	int slot = si_shader_io_get_unique_index_patch(semantic_name, 0);
	LLVMValueRef ring_base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
	LLVMValueRef ring_addr =
		get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
					   LLVMConstInt(ctx->ac.i32, slot, 0));

	return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring,
			   ring_base, ring_addr, true);
}

/* Load the default tess levels from the SI_HS_CONST_DEFAULT_TESS_LEVELS
 * buffer referenced through rw_buffers.
 *
 * Four dwords are read, starting at dword 4 for the default inner level
 * semantic and at dword 0 otherwise.
 */
static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx,
					    unsigned semantic_name)
{
	const unsigned first_dword =
		semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
	LLVMValueRef desc_slot =
		LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
	LLVMValueRef const_buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
	LLVMValueRef levels[4];

	const_buf = ac_build_load_to_sgpr(&ctx->ac, const_buf, desc_slot);

	for (unsigned chan = 0; chan < 4; chan++) {
		LLVMValueRef byte_offset =
			LLVMConstInt(ctx->ac.i32, (first_dword + chan) * 4, 0);

		levels[chan] = si_buffer_load_const(ctx, const_buf, byte_offset);
	}

	return ac_build_gather_values(&ctx->ac, levels, 4);
}

/* ABI callback: load the inner or outer tess level.
 *
 * \param varying_id		VARYING_SLOT_TESS_LEVEL_INNER or _OUTER
 * \param load_default_state	true to load the default levels instead of
 *				the values stored in the offchip ring
 */
static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
				       unsigned varying_id,
				       bool load_default_state)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	const bool is_inner = varying_id == VARYING_SLOT_TESS_LEVEL_INNER;

	if (!is_inner && varying_id != VARYING_SLOT_TESS_LEVEL_OUTER)
		unreachable("unknown tess level");

	if (load_default_state) {
		return load_tess_level_default(ctx, is_inner ?
					       TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL :
					       TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL);
	}

	return load_tess_level(ctx, is_inner ? TGSI_SEMANTIC_TESSINNER :
					       TGSI_SEMANTIC_TESSOUTER);
}

/* ABI callback: number of vertices in the input patch.
 *
 * In a TCS it is unpacked from tcs_out_lds_layout; in a TES it is the TCS
 * output vertex count. Other stages are invalid.
 */
static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);

	switch (ctx->type) {
	case PIPE_SHADER_TESS_CTRL:
		return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);

	case PIPE_SHADER_TESS_EVAL:
		return get_num_tcs_out_vertices(ctx);

	default:
		unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
	}
}

/**
 * Forward all outputs from the vertex shader to the TES. This is only used
 * for the fixed function TCS.
 *
 * For every slot set in ff_tcs_inputs_to_copy, the invocation's vec4 is read
 * from LDS and written to the offchip ring.
 */
static void si_copy_tcs_inputs(struct si_shader_context *ctx)
{
	LLVMValueRef invocation = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
	LLVMValueRef ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
	LLVMValueRef ring_base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
	LLVMValueRef vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
	LLVMValueRef vertex_lds = get_tcs_in_current_patch_offset(ctx);

	/* LDS address of this invocation's vertex within the current patch. */
	vertex_lds = ac_build_imad(&ctx->ac, invocation, vertex_stride, vertex_lds);

	for (uint64_t remaining = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
	     remaining; ) {
		unsigned slot = u_bit_scan64(&remaining);

		LLVMValueRef src_lds =
			LLVMBuildAdd(ctx->ac.builder, vertex_lds,
				     LLVMConstInt(ctx->ac.i32, 4 * slot, 0), "");
		LLVMValueRef dst_addr =
			get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
						   invocation,
						   LLVMConstInt(ctx->ac.i32, slot, 0));
		LLVMValueRef vec4 = lshs_lds_load(ctx, ctx->ac.i32, ~0, src_lds);

		ac_build_buffer_store_dword(&ctx->ac, ring, vec4, 4, dst_addr,
					    ring_base, 0, ac_glc);
	}
}

/* Emit the code that writes the tessellation factors of one patch.
 *
 * The factors are taken either from the invoc0_tf_* values (when the epilog
 * key says invocation 0 defines them) or loaded from LDS after a barrier.
 * They are stored to the tess factor ring and, if the TES reads them, also to
 * the offchip ring. All stores are emitted inside an "invocation_id == 0"
 * conditional.
 *
 * \param rel_patch_id				relative patch ID
 * \param invocation_id				TCS invocation ID
 * \param tcs_out_current_patch_data_offset	LDS dword address of the
 *						current patch's per-patch data
 * \param invoc0_tf_outer			outer factors from invocation 0
 * \param invoc0_tf_inner			inner factors from invocation 0
 */
static void si_write_tess_factors(struct si_shader_context *ctx,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset,
				  LLVMValueRef invoc0_tf_outer[4],
				  LLVMValueRef invoc0_tf_inner[2])
{
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;

	/* Add a barrier before loading tess factors from LDS. */
	if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
		si_llvm_emit_barrier(ctx);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	ac_build_ifcc(&ctx->ac,
		      LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
				    invocation_id, ctx->ac.i32_0, ""), 6503);

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Components that are not filled in below stay undefined. */
	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->ac.i32);
		outer[i] = LLVMGetUndef(ctx->ac.i32);
	}

	/* out[] holds the outer factors followed directly by the inner ones. */
	if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
		/* Tess factors are in VGPRs. */
		for (i = 0; i < outer_comps; i++)
			outer[i] = out[i] = invoc0_tf_outer[i];
		for (i = 0; i < inner_comps; i++)
			inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
	} else {
		/* Load tess_inner and tess_outer from LDS.
		 * Any invocation can write them, so we can't get them from a temporary.
		 */
		tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
		tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);

		lds_base = tcs_out_current_patch_data_offset;
		lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
					 LLVMConstInt(ctx->ac.i32,
						      tess_inner_index * 4, 0), "");
		lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
					 LLVMConstInt(ctx->ac.i32,
						      tess_outer_index * 4, 0), "");

		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
		}
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what NIR specifies.
		 */
		LLVMValueRef tmp = out[0];
		out[0] = out[1];
		out[1] = tmp;
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
	vec1 = NULL;

	/* Only the 6-dword layout needs a second (vec2) store. */
	if (stride > 4)
		vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);

	/* Get the buffer. */
	buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);

	/* Get the offset. */
	tf_base = ac_get_arg(&ctx->ac,
			     ctx->tcs_factor_offset);
	byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
				  LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");

	/* The control word below is written only by the patch with RelPatchID 0. */
	ac_build_ifcc(&ctx->ac,
		      LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
				    rel_patch_id, ctx->ac.i32_0, ""), 6504);

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->info.chip_class <= GFX8) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->ac.i32, 0x80000000, 0),
					    1, ctx->ac.i32_0, tf_base,
					    offset, ac_glc);
		/* The factors of every patch then start 4 bytes later. */
		offset += 4;
	}

	ac_build_endif(&ctx->ac, 6504);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, ac_glc);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, ac_glc);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
		base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index_patch(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->ac.i32, param_outer, 0));

		/* Without vec3 support the gathered vector is padded to a
		 * power-of-two size; the extra elements are the undef values
		 * set above.
		 */
		unsigned outer_vec_size =
			ac_has_vec3_support(ctx->screen->info.chip_class, false) ?
				outer_comps : util_next_power_of_two(outer_comps);
		outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, ac_glc);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index_patch(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->ac.i32, param_inner, 0));

			/* A single inner factor is stored as a scalar. */
			inner_vec = inner_comps == 1 ? inner[0] :
				    ac_build_gather_values(&ctx->ac, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, ac_glc);
		}
	}

	ac_build_endif(&ctx->ac, 6503);
}

/* End of the TCS main part (installed as abi.emit_outputs for TCS).
 *
 * Apart from copying the TCS inputs, nothing is stored here: the function
 * only packs the values consumed by the TCS epilog into ctx->return_value.
 * The tessellation factor levels themselves are written by the epilog.
 *
 * max_outputs and addrs are part of the callback signature and are not
 * referenced here.
 */
static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
				      unsigned max_outputs,
				      LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	LLVMBuilderRef b = ctx->ac.builder;
	const bool gfx9_plus = ctx->screen->info.chip_class >= GFX9;

	si_copy_tcs_inputs(ctx);

	LLVMValueRef patch = get_rel_patch_id(ctx);
	LLVMValueRef invoc = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
	LLVMValueRef tf_lds = get_tcs_out_current_patch_data_offset(ctx);

	if (gfx9_plus) {
		/* Incoming edges of the phis built below. The current block
		 * must be queried before the wrapping "if" is closed. */
		LLVMBasicBlockRef preds[2];
		preds[0] = LLVMGetInsertBlock(b);
		preds[1] = ctx->merged_wrap_if_entry_block;

		ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

		LLVMValueRef undef = LLVMGetUndef(ctx->ac.i32);
		LLVMValueRef patch_in[2] = { patch, undef };
		LLVMValueRef tf_lds_in[2] = { tf_lds, undef };
		/* i32 1 on the entry-block edge: cause the epilog to skip
		 * threads. */
		LLVMValueRef invoc_in[2] = { invoc, ctx->ac.i32_1 };

		patch = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, patch_in, preds);
		tf_lds = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, tf_lds_in, preds);
		invoc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, invoc_in, preds);
	}

	/* Everything below fills in the epilog parameters, which are
	 * returned from this function. */
	LLVMValueRef rv = ctx->return_value;
	unsigned slot;

	/* SGPRs */
	if (gfx9_plus) {
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_offchip_layout,
					 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_out_lds_layout,
					 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
		/* Tess offchip and tess factor offsets are at the beginning. */
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_offchip_offset, 2);
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_factor_offset, 4);
		slot = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
	} else {
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_offchip_layout,
					 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_out_lds_layout,
					 GFX6_SGPR_TCS_OUT_LAYOUT);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_offchip_offset,
					 GFX6_TCS_NUM_USER_SGPR);
		rv = si_insert_input_ret(ctx, rv, ctx->tcs_factor_offset,
					 GFX6_TCS_NUM_USER_SGPR + 1);
		slot = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs: the values are inserted as floats. */
	patch = ac_to_float(&ctx->ac, patch);
	invoc = ac_to_float(&ctx->ac, invoc);
	tf_lds = ac_to_float(&ctx->ac, tf_lds);

	/* Leave a hole corresponding to the two input VGPRs. This ensures that
	 * the invocation_id output does not alias the tcs_rel_ids input,
	 * which saves a V_MOV on gfx9.
	 */
	const unsigned first_vgpr = slot + 2;

	rv = LLVMBuildInsertValue(b, rv, patch, first_vgpr, "");
	rv = LLVMBuildInsertValue(b, rv, invoc, first_vgpr + 1, "");

	if (!ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
		/* The epilog receives the LDS offset of the tess factors. */
		rv = LLVMBuildInsertValue(b, rv, tf_lds, first_vgpr + 2, "");
	} else {
		/* The LDS offset slot (first_vgpr + 2) is skipped; the six
		 * values from ctx->invoc0_tess_factors follow it. */
		const unsigned first_factor = first_vgpr + 3;

		for (unsigned f = 0; f < 6; f++) {
			LLVMValueRef factor =
				LLVMBuildLoad(b, ctx->invoc0_tess_factors[f], "");

			factor = ac_to_float(&ctx->ac, factor);
			rv = LLVMBuildInsertValue(b, rv, factor,
						  first_factor + f, "");
		}
	}

	ctx->return_value = rv;
}

/* Pass TCS inputs from LS to TCS on GFX9. */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
	ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
	ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);

	ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
				  8 + SI_SGPR_RW_BUFFERS);
	ret = si_insert_input_ptr(ctx, ret,
				  ctx->bindless_samplers_and_images,
				  8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

	ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);

	ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);

	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
				   ac_to_float(&ctx->ac,
					       ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
				   vgpr++, "");
	ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
				   ac_to_float(&ctx->ac,
					       ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
				   vgpr++, "");
	ctx->return_value = ret;
}

/* LS epilogue: store the shader outputs to LDS, then (GFX9 and later) set up
 * the return value for the TCS.
 *
 * addrs holds four entries per output (one per channel); each used entry is
 * loaded and its value stored. max_outputs is not referenced here.
 */
void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
			      LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader_info *info = &ctx->shader->selector->info;
	LLVMBuilderRef builder = ctx->ac.builder;

	/* Dword address of this vertex: rel_auto_id * vertex stride. */
	LLVMValueRef rel_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
	LLVMValueRef dw_stride = get_tcs_in_vertex_dw_stride(ctx);
	LLVMValueRef vertex_base = LLVMBuildMul(builder, rel_id, dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (unsigned out = 0; out < info->num_outputs; out++) {
		unsigned sem_name = info->output_semantic_name[out];
		unsigned sem_index = info->output_semantic_index[out];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		bool ignored = sem_name == TGSI_SEMANTIC_LAYER ||
			       sem_name == TGSI_SEMANTIC_VIEWPORT_INDEX;
		if (ignored)
			continue;

		/* Each output occupies four dwords starting at param * 4. */
		int param = si_shader_io_get_unique_index(sem_name, sem_index, false);
		LLVMValueRef out_addr =
			LLVMBuildAdd(builder, vertex_base,
				     LLVMConstInt(ctx->ac.i32, param * 4, 0), "");

		for (unsigned c = 0; c < 4; c++) {
			/* Only channels present in the usage mask are stored. */
			if (info->output_usagemask[out] & (1 << c)) {
				LLVMValueRef value =
					LLVMBuildLoad(builder, addrs[4 * out + c], "");

				lshs_lds_store(ctx, c, out_addr, value);
			}
		}
	}

	if (ctx->screen->info.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}

/**
 * Compile the TCS epilog function. This writes tesselation factors to memory
 * based on the output primitive type of the tesselator (determined by TES).
 */
void si_llvm_build_tcs_epilog(struct si_shader_context *ctx,
			      union si_shader_part_key *key)
{
	memset(&ctx->args, 0, sizeof(ctx->args));

	if (ctx->screen->info.chip_class >= GFX9) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_factor_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_out_lds_layout);
	} else {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_out_lds_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &ctx->tcs_factor_offset);
	}

	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
	struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
	struct ac_arg invocation_id; /* invocation ID within the patch */
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
	struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
		   &tcs_out_current_patch_data_offset);

	struct ac_arg tess_factors[6];
	for (unsigned i = 0; i < 6; i++)
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);

	/* Create the function. */
	si_llvm_create_func(ctx, "tcs_epilog", NULL, 0,
			    ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
	ac_declare_lds_as_pointer(&ctx->ac);

	LLVMValueRef invoc0_tess_factors[6];
	for (unsigned i = 0; i < 6; i++)
		invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);

	si_write_tess_factors(ctx,
			      ac_get_arg(&ctx->ac, rel_patch_id),
			      ac_get_arg(&ctx->ac, invocation_id),
			      ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
			      invoc0_tess_factors, invoc0_tess_factors + 4);

	LLVMBuildRetVoid(ctx->ac.builder);
}

void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx)
{
	ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
	ctx->abi.load_tess_level = si_load_tess_level;
	ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
	ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
	ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
}

void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
	ctx->abi.load_tess_varyings = si_nir_load_input_tes;
	ctx->abi.load_tess_coord = si_load_tess_coord;
	ctx->abi.load_tess_level = si_load_tess_level;
	ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;

	if (ctx->shader->key.as_es)
		ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
	else if (ngg_cull_shader)
		ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
	else if (ctx->shader->key.as_ngg)
		ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
	else
		ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
}
